# Load lightgbm, installing it first if it is not available
if (!require(lightgbm)) install.packages("lightgbm")
library(lightgbm)
library(caret)
library(pROC)
library(MatchIt)
# dplyr supplies %>%, mutate(), filter(), group_by(), summarise() and ntile(),
# all of which are used further down in this script
library(dplyr)

# Switch to the project directory and read the modelling data.
# NOTE: "${path}" is a template placeholder; it must be substituted before the script is run.
setwd("${path}")
mydata <- read.csv("data.csv")

# Recode group_best as a two-level factor: "0" -> "event_0", "1" -> "event_1".
# Any value not listed in `levels` becomes NA.
mydata$group_best <- factor(mydata$group_best, levels = c("0", "1"), labels = c("event_0", "event_1"))


# Nearest-neighbour matching with matchit(), ratio = 3 (1:3 matching).
# NOTE: ${independent_and} is a template placeholder for the right-hand side of the
# formula (presumably the covariates joined with "+" -- confirm against the template engine).
m.out <- matchit(group_best ~ ${independent_and}, 
                 data = mydata, 
                 method = "nearest", 
                 ratio = 3)

# Extract the matched sample as a data frame
matched_data <- match.data(m.out)
# LightGBM training parameters: binary objective, AUC as the evaluation metric.
# (No cross-validation is configured in this script.)
params <- list(objective = "binary", metric = "auc")

# Feature matrix for the matched sample. drop = FALSE keeps a one-column
# selection as a named column, so a single-entry `cols` still produces a
# matrix that carries its feature name instead of an unnamed vector.
train_features <- as.matrix(matched_data[, c(cols), drop = FALSE])

# Train the LightGBM model for 100 boosting rounds. The label is the factor
# code shifted to 0/1 (event_0 -> 0, event_1 -> 1).
dtrain <- lgb.Dataset(
  data = train_features,
  label = as.numeric(matched_data$group_best) - 1
)
model_lightgbm <- lgb.train(params = params, data = dtrain, nrounds = 100)

# Predict on the matched (training) sample and classify at a 0.5 cut-off.
# Both `predictions_lightgbm` and `predicted_probs_lightgbm` are kept because
# later sections of the script refer to each of them.
predictions_lightgbm <- predict(model_lightgbm, newdata = train_features)
predicted_probs_lightgbm <- predictions_lightgbm
predicted_classes_lightgbm <- factor(
  ifelse(predicted_probs_lightgbm >= 0.5, "event_1", "event_0"),
  levels = c("event_0", "event_1")
)

# Attach the predictions to the matched data set
mydata_with_predictions_lightgbm <- matched_data %>%
  mutate(
    group_best_probability = predicted_probs_lightgbm,
    group_best_predicted_class = predicted_classes_lightgbm,
    group_best_score = NA  # LightGBM has no scorecard, so the score is left empty
  )

# Count the rows whose predicted class equals the observed group_best
matching_rows <- mydata_with_predictions_lightgbm %>%
  filter(group_best == group_best_predicted_class) %>%
  nrow()
cat("Number of rows where group_best and group_best_predicted_class match:", matching_rows, "\n")

# Accuracy on the modelling (matched) set: matching rows / total rows.
# The message is labelled as accuracy so it is not confused with the row
# count printed just above.
total_rows <- nrow(matched_data)
accuracy <- matching_rows / total_rows
cat("Accuracy (matching rows / total rows):", accuracy, "\n")

# Save the modelling-set results
write.csv(mydata_with_predictions_lightgbm, file = "mydata_with_predictions_lightgbm.csv", row.names = FALSE)
#### Feature importance plot ####
library(ggplot2)

# Per-feature importance of the trained model
importance <- lgb.importance(model_lightgbm)

# Copy the importance table into a plain data frame
importance_df <- data.frame(
  Feature = as.character(importance$Feature),
  Gain = importance$Gain,
  Cover = importance$Cover,
  Frequency = importance$Frequency,
  stringsAsFactors = FALSE
)

# Sort by Gain, largest first
importance_df <- importance_df[order(-importance_df$Gain), ]

# 800 x 800 is a pixel size: with units = "in" and dpi = 150 the device would
# have to allocate a 120000 x 120000 pixel canvas.
Cairo::CairoTIFF(
  file = "importanceFeatures_lightGBM.tiff",
  width = 800, height = 800, units = "px", dpi = 150
)

# Bar chart of importance by Gain. The plot is print()ed explicitly because
# ggplot objects are not auto-printed when the script is run via source().
importance_plot <- ggplot(importance_df, aes(x = reorder(Feature, Gain), y = Gain)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Feature Importance Plot (by Gain)",
       x = "Features",
       y = "Gain Score") +
  theme(axis.text.y = element_text(size = 8))
print(importance_plot)
dev.off()
#### Confusion matrix ####
# The predictions were made on matched_data, so the reference labels must come
# from matched_data as well; mydata is the unmatched set and has different rows.
# NOTE: no `positive` class is passed, so caret's default applies (for a
# two-level factor, the first level) -- confirm that event_0 is the intended
# positive class for sensitivity/specificity.
conf_matrix <- confusionMatrix(
  data = predicted_classes_lightgbm,
  reference = matched_data$group_best
)

# Write the printed summary to a text file. capture.output() restores the
# output connection itself, even if printing fails.
capture.output(print(conf_matrix), file = "confusionMatrix_lightGBM.txt")
#### ROC curve ####
# The ROC figure gets its own file; writing it to
# "importanceFeatures_lightGBM.tiff" would overwrite the feature-importance plot.
# 800 x 800 is a pixel size: with units = "in" and dpi = 150 the device would
# have to allocate a 120000 x 120000 pixel canvas.
Cairo::CairoTIFF(
  file = "roc_lightGBM.tiff",
  width = 800, height = 800, units = "px", dpi = 150
)

# ROC of the predicted probabilities against the observed outcome on the
# matched sample; plot = TRUE draws the curve on the open device.
roc_curve <- roc(
  matched_data$group_best, predictions_lightgbm,
  plot = TRUE, print.auc = TRUE, col = "darkgreen", lwd = 2, main = "ROC Curve"
)
dev.off()
#### Calibration plot ####
# Relationship between predicted probability and observed event rate
calib_data <- mydata_with_predictions_lightgbm %>%
  mutate(bin = ntile(group_best_probability, 10)) %>% # split the predictions into 10 bins
  group_by(bin) %>%
  summarise(
    mean_pred = mean(group_best_probability, na.rm = TRUE), # mean predicted probability per bin
    actual_rate = mean(as.numeric(group_best) - 1, na.rm = TRUE), # observed event rate per bin
    .groups = "drop"
  )

# 800 x 800 is a pixel size: with units = "in" and dpi = 150 the device would
# have to allocate a 120000 x 120000 pixel canvas.
Cairo::CairoTIFF(
  file = "cal_lightGBM.tiff",
  width = 800, height = 800, units = "px", dpi = 150
)

# The plot is print()ed explicitly because ggplot objects are not auto-printed
# when the script is run via source().
calibration_plot <- ggplot(calib_data, aes(x = mean_pred, y = actual_rate)) +
  geom_point(size = 3) + # one point per bin
  geom_line(linetype = "dashed", size = 1) + # connect the bins
  geom_abline(intercept = 0, slope = 1, linetype = "solid", color = "red") + # perfect-calibration line
  labs(title = "Calibration Plot for LightGBM Model",
       x = "Predicted Probability",
       y = "Actual Rate") +
  theme_minimal()
print(calibration_plot)
dev.off()
# Score the external data set with the trained model
mydata1 <- read.csv("result.csv")

# drop = FALSE keeps a one-column selection as a named column, so a
# single-entry `cols` still produces a matrix rather than an unnamed vector.
predictions_lightgbm <- predict(
  model_lightgbm,
  newdata = as.matrix(mydata1[, c(cols), drop = FALSE])
)
predicted_probs_lightgbm <- predictions_lightgbm
predicted_classes_lightgbm <- factor(
  ifelse(predicted_probs_lightgbm >= 0.5, "event_1", "event_0"),
  levels = c("event_0", "event_1")
)

# Attach the predictions to the scored data set
result_with_predictions_lightgbm <- mydata1 %>%
  mutate(
    group_best_probability = predicted_probs_lightgbm,
    group_best_predicted_class = predicted_classes_lightgbm,
    group_best_score = NA  # LightGBM has no scorecard, so the score is left empty
  )

# Save the scored results
write.csv(result_with_predictions_lightgbm, file = "result_with_predictions_lightgbm.csv", row.names = FALSE)
